Note: Please load the workspace file
implementation/R/workspace/preprocessing.RData to run the
following code and re-use the previously created variables. Furthermore,
the following libraries must be installed and loaded:
# install necessary packages (uncomment on first run)
#install.packages("quanteda")
#install.packages("readtext")
#install.packages("tidyverse")
#install.packages("quanteda.textstats")
#install.packages("quanteda.textplots")
#install.packages("data.table")
#install.packages("stringr")
#install.packages("spacyr")
#install.packages("textcat")
#install.packages("plyr")
# load libraries
# NOTE: plyr is loaded before dplyr on purpose; loading it afterwards
# would mask dplyr verbs such as count() and summarise().
library(quanteda)
library(readtext)
library(tidyverse)
library(quanteda.textplots)
library(quanteda.textstats)
library(plyr)
library(dplyr)
#library(stringr)
#library(data.table)
#library(textcat)
To retrieve the direct collocations, i.e. the single token to the left or
right of the compound word, we use the kwic function
offered by quanteda. We choose a window of 1 so that exactly one
token on each side of the keyword is returned.
Let’s have a first look at the collocations for the example
“Klimaleugner” (en: “climate denier”). We are going to retrieve the
collocations to the left (pre) and to the right
(post) of the key word and count their occurrences. Then,
we will output the Top-5 collocations for each category,
i.e. pre and post.
# Keyword-in-context for a single example compound word.
word <- "klimaleugner"

# Direct collocations (window of 1 token) in the C2022 corpus.
kwic_con <- sp_c2022_tokens %>%
  kwic(pattern = word, window = 1, valuetype = "fixed") %>%
  as_tibble()

# Direct collocations (window of 1 token) in the P2022 corpus.
kwic_pro <- sp_p2022_tokens %>%
  kwic(pattern = word, window = 1, valuetype = "fixed") %>%
  as_tibble()
Let’s show the Top-5 for the C2022 corpus:
# Five most frequent collocates preceding the keyword in C2022.
kwic_con %>%
  dplyr::count(pre) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
# Five most frequent collocates following the keyword in C2022.
kwic_con %>%
  dplyr::count(post) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
And the Top-5 for the P2022 corpus:
# Five most frequent collocates preceding the keyword in P2022.
kwic_pro %>%
  dplyr::count(pre) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
# Five most frequent collocates following the keyword in P2022.
kwic_pro %>%
  dplyr::count(post) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
Now we seek to create tables that contain the top 5 pre
and post collocations for each of our compound words.
Firstly, we create a table for the collocations we can obtain from
P2022
# For each compound, collect the top-5 preceding and following
# collocations from the P2022 corpus into one long table with columns
# word (collocate), n (count), keyword (compound), tag ("pre"/"post").
# Parts are accumulated in a list and bound once at the end, which
# avoids the quadratic rbind-in-a-loop pattern of the original.
pro_colls_parts <- list()
# for each compound
for (word in compounds) {
  # direct collocations (window = 1) of the current compound
  kwic_pro <- kwic(sp_p2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
  # retrieve top-5 preceding collocations
  pro_pre <- kwic_pro %>%
    dplyr::count(pre) %>%
    arrange(desc(n)) %>%
    head(n = 5)
  # retrieve top-5 following collocations
  pro_post <- kwic_pro %>%
    dplyr::count(post) %>%
    arrange(desc(n)) %>%
    head(n = 5)
  # normalise both tables to the shared layout: word, n, keyword, tag
  names(pro_pre)[names(pro_pre) == "pre"] <- "word"
  pro_pre$keyword <- word
  pro_pre$tag <- "pre"
  names(pro_post)[names(pro_post) == "post"] <- "word"
  pro_post$keyword <- word
  pro_post$tag <- "post"
  pro_colls_parts[[length(pro_colls_parts) + 1]] <- pro_pre
  pro_colls_parts[[length(pro_colls_parts) + 1]] <- pro_post
}
pro_colls10 <- dplyr::bind_rows(pro_colls_parts)
Most of the collocations only occur exactly once. Since this is not very informative for us, we remove all the collocations with a count of exactly 1. Also, we want to remove noise, i.e. empty strings from the collocations.
# Only keep collocations that appear more than once.
top_colls_pro <- pro_colls10[pro_colls10$n > 1, ]
# Remove empty/whitespace-only collocates. The original test
# (word > " ") relied on locale-dependent string collation; an
# explicit emptiness check states the intent directly.
top_colls_pro <- top_colls_pro[trimws(top_colls_pro$word) != "", ]
And save the table to a csv file.
Then, we create the same table of the top 5 pre and
post collocations for the C2022.
# For each compound, collect the top-5 preceding and following
# collocations from the C2022 corpus into one long table with columns
# word (collocate), n (count), keyword (compound), tag ("pre"/"post").
# Parts are accumulated in a list and bound once at the end, which
# avoids the quadratic rbind-in-a-loop pattern of the original.
con_colls_parts <- list()
# for each compound
for (word in compounds) {
  # direct collocations (window = 1) of the current compound
  kwic_con <- kwic(sp_c2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
  # retrieve top-5 preceding collocations
  con_pre <- kwic_con %>%
    dplyr::count(pre) %>%
    arrange(desc(n)) %>%
    head(n = 5)
  # retrieve top-5 following collocations
  con_post <- kwic_con %>%
    dplyr::count(post) %>%
    arrange(desc(n)) %>%
    head(n = 5)
  # normalise both tables to the shared layout: word, n, keyword, tag
  names(con_pre)[names(con_pre) == "pre"] <- "word"
  con_pre$keyword <- word
  con_pre$tag <- "pre"
  names(con_post)[names(con_post) == "post"] <- "word"
  con_post$keyword <- word
  con_post$tag <- "post"
  con_colls_parts[[length(con_colls_parts) + 1]] <- con_pre
  con_colls_parts[[length(con_colls_parts) + 1]] <- con_post
}
con_colls10 <- dplyr::bind_rows(con_colls_parts)
And, just like before, we remove the collocations that appeared only once in the corpus (and remove noise, i.e. empty strings from the collocations).
# Only keep collocations that appear more than once.
top_colls_con <- con_colls10[con_colls10$n > 1, ]
# Remove empty/whitespace-only collocates. The original test
# (word > " ") relied on locale-dependent string collation; an
# explicit emptiness check states the intent directly.
top_colls_con <- top_colls_con[trimws(top_colls_con$word) != "", ]
And save the final table to a csv file.
write.csv(top_colls_con, "../output/top_collocations_con1.csv")
To retrieve the context of each compound word, we extract the concordances on a sentence level. That means, we extract a window of 5 sentences to the left and to the right of the keyword sentence. To do this, we must tokenize our data by sentences, instead of words.
Since we cannot normalize the data the same way when we tokenize it on sentence-level, we firstly create word tokens from the corpora.
# Word-level tokens for the P2022 and C2022 corpora. Punctuation is
# kept (needed for the sentence segmentation later); symbols, numbers,
# URLs and separators are dropped.
p2022_tokens <- tokens(
  pro2022,
  remove_punct = FALSE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
c2022_tokens <- tokens(
  contra2022,
  remove_punct = FALSE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
To these tokens, we apply a normalization step where we remove hyphens within words, such as “Klima-Skeptiker” to convert it to “Klimaskeptiker”.
# Remove hyphens inside words, e.g. "Klima-Skeptiker" -> "Klimaskeptiker".
# The identical three-step procedure was previously duplicated for both
# corpora; it is factored into one helper here.
dehyphenate <- function(toks) {
  # Join multi-token hyphenated sequences into a single "compound" token.
  joined <- tokens_compound(toks, phrase("*-*"), concatenator = "")
  # All token types that still contain an inner hyphen.
  hyphenated <- grep("\\w+-\\w+", types(joined), value = TRUE)
  # Replace the hyphenated tokens by versions without the hyphen.
  tokens_replace(joined, hyphenated, gsub("-", "", hyphenated))
}
p2022_toks_cleaned <- dehyphenate(p2022_tokens)
c2022_toks_cleaned <- dehyphenate(c2022_tokens)
# Merge the cleaned tokens back into corpus objects so that they can be
# re-tokenised on sentence level below.
p2022_merged_toks <- corpus(sapply(p2022_toks_cleaned, paste, collapse = " "))
c2022_merged_toks <- corpus(sapply(c2022_toks_cleaned, paste, collapse = " "))
Now we can create sentence tokens for both corpora.
# Sentence-level tokens for the P2022 and C2022 corpora.
p2022_sentences <- tokens(
  p2022_merged_toks,
  what = "sentence",
  remove_punct = FALSE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
c2022_sentences <- tokens(
  c2022_merged_toks,
  what = "sentence",
  remove_punct = FALSE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# P2022: for each compound word, extract a kwic window of 5 sentences
# before and after every sentence containing the keyword, then stack
# the per-word results. (The original pre-initialised an empty 7-column
# data frame that was unconditionally overwritten — removed as dead code.)
kwiclist_sent_pro <- lapply(compounds, function(word) {
  kwic(p2022_sentences, word, valuetype = "regex", window = 5)
})
# keep the per-compound names, matching the original list keyed by word
names(kwiclist_sent_pro) <- compounds
kwic_pro_sent.df <- do.call(rbind, kwiclist_sent_pro) # save to final data frame
# C2022: same extraction as for P2022 — a window of 5 sentences around
# every keyword sentence, per compound, stacked into one data frame.
# (The dead zero-row pre-initialisation was removed here as well.)
kwiclist_sent_con <- lapply(compounds, function(word) {
  kwic(c2022_sentences, word, valuetype = "regex", window = 5)
})
# keep the per-compound names, matching the original list keyed by word
names(kwiclist_sent_con) <- compounds
kwic_con_sent.df <- do.call(rbind, kwiclist_sent_con) # save to final data frame
# Inspect the sentence-level kwic tables; the lines without a leading
# "#" below the calls are console output captured from a previous run.
kwic_pro_sent.df
Keyword-in-context with 803 matches.
kwic_con_sent.df
Keyword-in-context with 1,946 matches.
[ reached max_nrow ... 946 more matches ]
# save the sentence-context tables to csv files (consumed downstream,
# e.g. by the Python part of the pipeline)
write.csv(kwic_pro_sent.df,"../output/pro_context_new.csv", row.names = FALSE)
write.csv(kwic_con_sent.df,"../output/con_context_new.csv", row.names = FALSE)
# Ad-hoc spot checks of two individual compounds — presumably kept for
# manual inspection of the match behaviour; confirm they are still needed.
kwic(c2022_merged_toks, pattern="klimaanbeter", window=1, valuetype="regex")
kwic(c2022_sentences, pattern="klimabank", window=1, valuetype="regex")
Now, we will retrieve the collocations for each compound word. The collocations are then saved to a data frame, one for each corpus, and exported to a csv file, such that we can also use the data in Python
### C2022
# Direct collocations (window = 1) of every compound in the C2022
# corpus, stacked into a single table. The kwic result already carries
# the columns docname/from/to/pre/keyword/post/pattern, so the original
# hand-written column template (which also mistyped pre as logical) is
# unnecessary; accumulating per-word results in a list and binding once
# avoids the rbind-in-a-loop pattern.
collocations_con_parts <- lapply(compounds, function(word) {
  kwic(sp_c2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
})
collocations_con <- dplyr::bind_rows(collocations_con_parts)
### P2022
# Direct collocations (window = 1) of every compound in the P2022
# corpus, stacked into a single table — same construction as for C2022
# above; the redundant column template and per-iteration empty data
# frames were removed.
collocations_pro_parts <- lapply(compounds, function(word) {
  kwic(sp_p2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
})
collocations_pro <- dplyr::bind_rows(collocations_pro_parts)
Please run the following lines to save the output to a csv file.
#write.csv(collocations_con, "../output/collocations_con.csv")
#write.csv(collocations_pro, "../output/collocations_pro.csv")
# For each compound, replace every inflected word form with its lemma
# (the "original" column) in the pro2000/contra2000 token sets.
# FIX: the original row lookup used data.table's %like% operator, but
# library(data.table) is commented out at the top of this file, so the
# loop would fail; base grepl() expresses the same containment lookup
# (fixed = TRUE, since the word forms are literal strings, not regexes).
# NOTE(review): this loop calls unlist_forms(), which is defined further
# down in the file — ensure the definition is evaluated before this cell.
for (word_form in compound_df$compound_forms){
  word <- unlist_forms(word_form) # vector of all word forms of this compound
  # lemma = the "original" entry of the row whose form list contains the first form
  matching_rows <- grepl(word[[1]], compound_df$compound_forms, fixed = TRUE)
  original <- compound_df[matching_rows, ]$original[[1]]
  lemma <- rep(original, length(word))
  # replace every form with the lemma form (for pro2000 and contra2000)
  pro2000_tokens <- tokens_replace(pro2000_tokens, word, lemma, valuetype = "fixed")
  contra2000_tokens <- tokens_replace(contra2000_tokens, word, lemma, valuetype = "fixed")}
# Split a (vector of) comma-separated word-form string(s) into one flat
# character vector and strip all spaces, e.g. "a, ab" -> c("a", "ab").
# Used to expand the compound_forms column of the compounds data frame.
unlist_forms <- function(word) {
  pieces <- strsplit(word, ",")
  flat <- unlist(pieces)
  gsub(" ", "", flat)
}
# apply the function to our compounds data frame: flattens every
# comma-separated form list into one character vector of word forms
compound_forms <- unlist_forms(compound_df$compound_forms)
#for (word in compound_forms){
# print(word)
#}
# print the resulting vector of word forms for inspection
compound_forms